candy_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/candy-power-ranking/candy-data.csv"
candy <- read.csv(candy_url, row.names = 1)
head(candy)
## chocolate fruity caramel peanutyalmondy nougat crispedricewafer
## 100 Grand 1 0 1 0 0 1
## 3 Musketeers 1 0 0 0 1 0
## One dime 0 0 0 0 0 0
## One quarter 0 0 0 0 0 0
## Air Heads 0 1 0 0 0 0
## Almond Joy 1 0 0 1 0 0
## hard bar pluribus sugarpercent pricepercent winpercent
## 100 Grand 0 1 0 0.732 0.860 66.97173
## 3 Musketeers 0 1 0 0.604 0.511 67.60294
## One dime 0 0 0 0.011 0.116 32.26109
## One quarter 0 0 0 0.011 0.511 46.11650
## Air Heads 0 0 0 0.906 0.511 52.34146
## Almond Joy 0 1 0 0.465 0.767 50.34755
Q1. How many different candy types are in this dataset?
num_candy <- nrow(candy)
sum(num_candy)
## [1] 85
Q2. How many fruity andy types are in the dataset?
num_fruity_candy <- sum(candy$fruity)
sum(candy$fruity)
## [1] 38
Q3. What is your favorite candy in the dataset and what is it’s winpercent value?
fav_candy <- "Twix"
fav_winpercent <- candy[fav_candy, ]$winpercent
sum(fav_winpercent)
## [1] 81.64291
Q4. What is the winpercent value for Kit Kat?
kit_kat <- "Kit Kat"
kit_kat_winpercent <- candy[kit_kat, ]$winpercent
sum(kit_kat_winpercent)
## [1] 76.7686
Q5. What is the winpercent value for Tootsie Roll Snack Bars?
tootsie_roll <- "Tootsie Roll Snack Bars"
tootsie_roll_winpercent <- candy[tootsie_roll, ]$winpercent
sum(tootsie_roll_winpercent)
## [1] 49.6535
Now we use “skimr”
library("skimr")
skim(candy)
| Name | candy |
| Number of rows | 85 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| numeric | 12 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| chocolate | 0 | 1 | 0.44 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| fruity | 0 | 1 | 0.45 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| caramel | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| peanutyalmondy | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| nougat | 0 | 1 | 0.08 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| crispedricewafer | 0 | 1 | 0.08 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| hard | 0 | 1 | 0.18 | 0.38 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| bar | 0 | 1 | 0.25 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| pluribus | 0 | 1 | 0.52 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| sugarpercent | 0 | 1 | 0.48 | 0.28 | 0.01 | 0.22 | 0.47 | 0.73 | 0.99 | ▇▇▇▇▆ |
| pricepercent | 0 | 1 | 0.47 | 0.29 | 0.01 | 0.26 | 0.47 | 0.65 | 0.98 | ▇▇▇▇▆ |
| winpercent | 0 | 1 | 50.32 | 14.71 | 22.45 | 39.14 | 47.83 | 59.86 | 84.18 | ▃▇▆▅▂ |
Q6. Is there any variable/column that looks to be on a different scale to the majority of the other columns in the dataset?
skim_output <- skim(candy)
different_scale <- skim_output$skim$variable[skim_output$skim$type != "logical" & skim_output$skim$type != "integer"]
## Warning: Unknown or uninitialised column: `skim`.
## Unknown or uninitialised column: `skim`.
## Unknown or uninitialised column: `skim`.
print(different_scale)
## NULL
Q7. What do you think a zero and one represent for the candy$chocolate column?
chocolate_variable_type <- skim_output$skim$type[skim_output$skim$variable == "chocolate"]
## Warning: Unknown or uninitialised column: `skim`.
## Unknown or uninitialised column: `skim`.
print(chocolate_variable_type)
## NULL
Q8. Plot a histogram of winpercent values
hist(candy$winpercent, main = "Histogram of Winpercent Values", xlab = "Winpercent")
Q9. Is the distribution of winpercent values symmetrical?
Not symmetrical. Right skewed distribution.
Q10. Is the center of the distribution above or below 50%?
winpercent_mean <- mean(candy$winpercent)
sum(winpercent_mean)
## [1] 50.31676
Q11. On average is chocolate candy higher or lower ranked than fruit candy?
chocolate_mean <- mean(candy$winpercent[as.logical(candy$chocolate)])
fruity_mean <- mean(candy$winpercent[as.logical(candy$fruity)])
print(chocolate_mean)
## [1] 60.92153
print(fruity_mean)
## [1] 44.11974
Q12. Is this difference statistically significant?
t_test_result <- t.test(candy$winpercent[as.logical(candy$chocolate)],
candy$winpercent[as.logical(candy$fruity)])
print( t_test_result$p.value)
## [1] 2.871378e-08
Q13. What are the five least liked candy types in this set?
least_liked_candy <- head(candy[order(candy$winpercent), ], n = 5)
print(least_liked_candy)
## chocolate fruity caramel peanutyalmondy nougat
## Nik L Nip 0 1 0 0 0
## Boston Baked Beans 0 0 0 1 0
## Chiclets 0 1 0 0 0
## Super Bubble 0 1 0 0 0
## Jawbusters 0 1 0 0 0
## crispedricewafer hard bar pluribus sugarpercent pricepercent
## Nik L Nip 0 0 0 1 0.197 0.976
## Boston Baked Beans 0 0 0 1 0.313 0.511
## Chiclets 0 0 0 1 0.046 0.325
## Super Bubble 0 0 0 0 0.162 0.116
## Jawbusters 0 1 0 1 0.093 0.511
## winpercent
## Nik L Nip 22.44534
## Boston Baked Beans 23.41782
## Chiclets 24.52499
## Super Bubble 27.30386
## Jawbusters 28.12744
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
least_liked_candy_dplyr <- candy %>% arrange(winpercent) %>% head(5)
print(least_liked_candy_dplyr)
## chocolate fruity caramel peanutyalmondy nougat
## Nik L Nip 0 1 0 0 0
## Boston Baked Beans 0 0 0 1 0
## Chiclets 0 1 0 0 0
## Super Bubble 0 1 0 0 0
## Jawbusters 0 1 0 0 0
## crispedricewafer hard bar pluribus sugarpercent pricepercent
## Nik L Nip 0 0 0 1 0.197 0.976
## Boston Baked Beans 0 0 0 1 0.313 0.511
## Chiclets 0 0 0 1 0.046 0.325
## Super Bubble 0 0 0 0 0.162 0.116
## Jawbusters 0 1 0 1 0.093 0.511
## winpercent
## Nik L Nip 22.44534
## Boston Baked Beans 23.41782
## Chiclets 24.52499
## Super Bubble 27.30386
## Jawbusters 28.12744
Q14. What are the top 5 all-time favorite candy types out of this set?
top_favorite_candy <- tail(candy[order(candy$winpercent), ], n = 5)
print(top_favorite_candy)
## chocolate fruity caramel peanutyalmondy nougat
## Snickers 1 0 1 1 1
## Kit Kat 1 0 0 0 0
## Twix 1 0 1 0 0
## Reese's Miniatures 1 0 0 1 0
## Reese's Peanut Butter cup 1 0 0 1 0
## crispedricewafer hard bar pluribus sugarpercent
## Snickers 0 0 1 0 0.546
## Kit Kat 1 0 1 0 0.313
## Twix 1 0 1 0 0.546
## Reese's Miniatures 0 0 0 0 0.034
## Reese's Peanut Butter cup 0 0 0 0 0.720
## pricepercent winpercent
## Snickers 0.651 76.67378
## Kit Kat 0.511 76.76860
## Twix 0.906 81.64291
## Reese's Miniatures 0.279 81.86626
## Reese's Peanut Butter cup 0.651 84.18029
top_favorite_candy_dplyr <- candy %>% arrange(desc(winpercent)) %>% head(5)
print(top_favorite_candy_dplyr)
## chocolate fruity caramel peanutyalmondy nougat
## Reese's Peanut Butter cup 1 0 0 1 0
## Reese's Miniatures 1 0 0 1 0
## Twix 1 0 1 0 0
## Kit Kat 1 0 0 0 0
## Snickers 1 0 1 1 1
## crispedricewafer hard bar pluribus sugarpercent
## Reese's Peanut Butter cup 0 0 0 0 0.720
## Reese's Miniatures 0 0 0 0 0.034
## Twix 1 0 1 0 0.546
## Kit Kat 1 0 1 0 0.313
## Snickers 0 0 1 0 0.546
## pricepercent winpercent
## Reese's Peanut Butter cup 0.651 84.18029
## Reese's Miniatures 0.279 81.86626
## Twix 0.906 81.64291
## Kit Kat 0.511 76.76860
## Snickers 0.651 76.67378
Q15. Make a first barplot of candy ranking based on winpercent values.
library(ggplot2)
ggplot(candy) +
aes(winpercent, reorder(rownames(candy), winpercent)) +
geom_bar(stat = "identity") +
labs(x = "Winpercent", y = "Candy Type") +
ggtitle("Candy Ranking Based on Winpercent Values (Sorted)")
# Set up color vector
my_cols <- rep("black", nrow(candy))
my_cols[as.logical(candy$chocolate)] <- "chocolate"
my_cols[as.logical(candy$bar)] <- "brown"
my_cols[as.logical(candy$fruity)] <- "red"
# Plot barplot with specified colors
ggplot(candy) +
aes(winpercent, reorder(rownames(candy), winpercent)) +
geom_col(fill = my_cols) +
labs(x = "Winpercent", y = "Candy Type") +
ggtitle("Candy Ranking Based on Winpercent Values (Sorted) with Candy Type Colors")
Q17. what is the worst ranked chocolate candy?
Charleston Chew
Q18. What is the best ranked fruity candy?
Nik L Nip
library(ggrepel)
ggplot(candy) +
aes(winpercent, pricepercent, label = rownames(candy)) +
geom_point(col = my_cols) +
geom_text_repel(col = my_cols, size = 3.3, max.overlaps = 5)
## Warning: ggrepel: 53 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Q19. Which candy type is the highest ranked in terms of winpercent for the least money - i.e. offers the most bang for your buck?
candy$win_to_price_ratio <- candy$winpercent / candy$pricepercent
best_value_candy <- candy[which.max(candy$win_to_price_ratio), "win_to_price_ratio"]
cat("Q19. The candy type offering the most bang for your buck is:", rownames(best_value_candy), "\n")
## Q19. The candy type offering the most bang for your buck is:
print(rownames(best_value_candy))
## NULL
Q20. What are the top 5 most expensive candy types in the dataset and of these which is the least popular?
ord <- order(candy$pricepercent, decreasing = TRUE)
top_expensive_candies <- head(candy[ord, c("pricepercent", "winpercent")], n = 5)
cat("Q20. The top 5 most expensive candy types in the dataset are:\n")
## Q20. The top 5 most expensive candy types in the dataset are:
print(top_expensive_candies)
## pricepercent winpercent
## Nik L Nip 0.976 22.44534
## Nestle Smarties 0.976 37.88719
## Ring pop 0.965 35.29076
## Hershey's Krackel 0.918 62.28448
## Hershey's Milk Chocolate 0.918 56.49050
least_popular_expensive_candy <- top_expensive_candies[which.min(top_expensive_candies$winpercent), ]
cat("The least popular candy among the top 5 most expensive candy types is:", rownames(least_popular_expensive_candy), "\n")
## The least popular candy among the top 5 most expensive candy types is: Nik L Nip
library(ggplot2)
ggplot(candy) +
aes(pricepercent, reorder(rownames(candy), pricepercent)) +
geom_segment(aes(yend = reorder(rownames(candy), pricepercent),
xend = 0), col = "gray40") +
geom_point()
library(corrplot)
## corrplot 0.92 loaded
cij <- cor(candy)
corrplot(cij)
Q22. Examining this plot what two variables are anti-correlated (i.e. have minus values)?
cij <- cor(candy)
min_corr <- min(cij)
anti_corr_indices <- which(cij == min_corr, arr.ind = TRUE)
anti_corr_variables <- rownames(cij)[anti_corr_indices[1, 1]]
anti_corr_variables2 <- colnames(cij)[anti_corr_indices[1, 2]]
cat("Q22. The two variables that are anti-correlated are:", anti_corr_variables, "and", anti_corr_variables2, "\n")
## Q22. The two variables that are anti-correlated are: fruity and chocolate
Q23. Similarly, what two variables are most positively correlated?
max_corr <- max(cij)
pos_corr_indices <- which(cij == max_corr, arr.ind = TRUE)
pos_corr_variables <- rownames(cij)[pos_corr_indices[1, 1]]
pos_corr_variables2 <- colnames(cij)[pos_corr_indices[1, 2]]
cat("Q23. The two variables that are most positively correlated are:", pos_corr_variables, "and", pos_corr_variables2, "\n")
## Q23. The two variables that are most positively correlated are: chocolate and chocolate
pca <- prcomp(candy, scale = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0938 1.2127 1.13054 1.0787 0.98027 0.93656 0.81530
## Proportion of Variance 0.3372 0.1131 0.09832 0.0895 0.07392 0.06747 0.05113
## Cumulative Proportion 0.3372 0.4503 0.54866 0.6382 0.71208 0.77956 0.83069
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.78462 0.68466 0.66328 0.57829 0.43128 0.39534
## Proportion of Variance 0.04736 0.03606 0.03384 0.02572 0.01431 0.01202
## Cumulative Proportion 0.87804 0.91410 0.94794 0.97367 0.98798 1.00000
plot(pca$x[, 1], pca$x[, 2],
xlab = "PC1", ylab = "PC2",
main = "PCA Score Plot of PC1 vs PC2")
plot(pca$x[, 1:2], col = my_cols, pch = 16,
xlab = "PC1", ylab = "PC2",
main = "PCA Score Plot of PC1 vs PC2")
my_data <- cbind(candy, pca$x[, 1:3])
library(ggplot2)
p <- ggplot(my_data) +
aes(x = PC1, y = PC2,
size = winpercent / 100,
text = rownames(my_data),
label = rownames(my_data)) +
geom_point(col = my_cols)
print(p)
library(ggrepel)
p + geom_text_repel(size=3.3, col=my_cols, max.overlaps = 7) +
theme(legend.position = "none") +
labs(title="Halloween Candy PCA Space",
subtitle="Colored by type: chocolate bar (dark brown), chocolate other (light brown), fruity (red), other (black)",
caption="Data from 538")
## Warning: ggrepel: 56 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ggplotly(p)
par(mar = c(8, 4, 2, 2))
barplot(pca$rotation[, 1], las = 2, ylab = "PC1 Contribution")
Q24. What original variables are picked up strongly by PC1 in the positive direction? Do these make sense to you?
loadings_pc1 <- pca$rotation[, 1]
strong_positive_loadings_pc1 <- names(loadings_pc1[loadings_pc1 > 0])
cat("Q24. Original variables picked up strongly by PC1 in the positive direction are:", paste(strong_positive_loadings_pc1, collapse = ", "), "\n")
## Q24. Original variables picked up strongly by PC1 in the positive direction are: fruity, hard, pluribus, win_to_price_ratio